Title: “PCA analysis on social media data” Author: “” Date: “3/3/2023”

library(readr)
social_media <- read_csv("/Users/pallemkrishnaniveditha/Downloads/social_media.csv")
## Rows: 1000 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): gender, platform, interests, location, demographics, profession
## dbl (5): age, time_spent, income, indebt_n, Owns_car_n
## lgl (1): isHomeOwner
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(social_media)
## spc_tbl_ [1,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age         : num [1:1000] 56 46 32 60 25 38 56 36 40 28 ...
##  $ gender      : chr [1:1000] "male" "female" "male" "non-binary" ...
##  $ time_spent  : num [1:1000] 3 2 8 5 1 3 8 4 7 2 ...
##  $ platform    : chr [1:1000] "Instagram" "Facebook" "Instagram" "Instagram" ...
##  $ interests   : chr [1:1000] "Sports" "Travel" "Sports" "Travel" ...
##  $ location    : chr [1:1000] "United Kingdom" "United Kingdom" "Australia" "United Kingdom" ...
##  $ demographics: chr [1:1000] "Urban" "Urban" "Sub_Urban" "Urban" ...
##  $ profession  : chr [1:1000] "Software Engineer" "Student" "Marketer Manager" "Student" ...
##  $ income      : num [1:1000] 19774 10564 13258 12500 14566 ...
##  $ indebt_n    : num [1:1000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ isHomeOwner : logi [1:1000] FALSE TRUE FALSE TRUE TRUE TRUE ...
##  $ Owns_car_n  : num [1:1000] 0 1 0 1 1 1 1 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   gender = col_character(),
##   ..   time_spent = col_double(),
##   ..   platform = col_character(),
##   ..   interests = col_character(),
##   ..   location = col_character(),
##   ..   demographics = col_character(),
##   ..   profession = col_character(),
##   ..   income = col_double(),
##   ..   indebt_n = col_double(),
##   ..   isHomeOwner = col_logical(),
##   ..   Owns_car_n = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
attach(social_media)
# Converting categorical columns to numeric
social_media$gender <- as.numeric(factor(social_media$gender))
social_media$platform <- as.numeric(factor(social_media$platform))

# Selecting only numeric columns
numericCols <- social_media[, sapply(social_media, is.numeric)]

# Performing PCA on numeric columns, scaling the data
social_media_pca <- prcomp(numericCols, scale. = FALSE)

# Display the PCA object
print(social_media_pca)
## Standard deviations (1, .., p=7):
## [1] 2958.6284561   13.4464979    2.5365388    0.8170947    0.7950003
## [6]    0.4991299    0.0000000
## 
## Rotation (n x k) = (7 x 7):
##                      PC1           PC2           PC3           PC4
## age        -3.987055e-04  0.9999776139  6.563718e-03  1.221494e-03
## gender     -9.272301e-06 -0.0010506288 -6.560843e-03  9.244886e-01
## time_spent  4.080700e-06 -0.0065636621  9.998861e-01  2.261760e-03
## platform   -1.905031e-06  0.0006282806 -1.030351e-02 -3.806174e-01
## income      9.999999e-01  0.0003987151 -1.549831e-06  8.346483e-06
## indebt_n    0.000000e+00  0.0000000000  0.000000e+00  0.000000e+00
## Owns_car_n  1.026553e-06 -0.0001803942  5.965418e-03 -2.108672e-02
##                      PC5           PC6 PC7
## age         9.519345e-05 -1.710497e-04   0
## gender     -3.811306e-01 -4.026037e-03   0
## time_spent -1.177552e-02  6.404328e-03   0
## platform   -9.235499e-01  4.560073e-02   0
## income     -5.165563e-06  9.806636e-07   0
## indebt_n    0.000000e+00  0.000000e+00   1
## Owns_car_n -4.069904e-02 -9.989311e-01   0
# Summary of PCA to see explained variance
summary(social_media_pca)
## Importance of components:
##                         PC1      PC2   PC3    PC4   PC5    PC6 PC7
## Standard deviation     2959 13.44650 2.537 0.8171 0.795 0.4991   0
## Proportion of Variance    1  0.00002 0.000 0.0000 0.000 0.0000   0
## Cumulative Proportion     1  1.00000 1.000 1.0000 1.000 1.0000   1
#INFERENCE

#We have considered the numeric values and we have considered the columns containing numeric values and later performed PCA on the numeric values and have obtained PC1,PC2,PC3,PC4,PC5,PC6 and PC7. We got the Standard deviation, Proportion of Variance and Cumulative Proportion.
# Convert factors to numeric as needed
social_media$gender <- as.numeric(factor(social_media$gender))
social_media$platform <- as.numeric(factor(social_media$platform))
social_media$location <- as.numeric(factor(social_media$location))
social_media$demographics <- as.numeric(factor(social_media$demographics))
social_media$profession <- as.numeric(factor(social_media$profession))
social_media$interests <- as.numeric(factor(social_media$interests))
social_media$isHomeOwner <- as.numeric(social_media$isHomeOwner) - 1 # Convert TRUE/FALSE to 1/0
numericCols <- social_media[, sapply(social_media, is.numeric)]
social_media_pca <- prcomp(numericCols, scale. = FALSE)
# Display the PCA object
print(social_media_pca)
## Standard deviations (1, .., p=12):
##  [1] 2.958628e+03 1.344658e+01 2.537088e+00 8.420396e-01 8.212140e-01
##  [6] 8.181288e-01 8.095530e-01 7.995478e-01 7.856290e-01 7.027667e-01
## [11] 7.690127e-16 3.113997e-19
## 
## Rotation (n x k) = (12 x 12):
##                        PC1           PC2           PC3           PC4
## age          -3.987055e-04  0.9999712486  6.613825e-03  2.900275e-03
## gender       -9.272301e-06 -0.0010511450 -6.542692e-03  5.168512e-01
## time_spent    4.080700e-06 -0.0065652756  9.996442e-01 -3.457703e-03
## platform     -1.905031e-06  0.0006280188 -1.024693e-02  7.448229e-03
## interests    -6.221147e-06 -0.0009324409 -3.202427e-03  5.581186e-01
## location     -5.789312e-06  0.0016060967 -1.595995e-02 -5.009750e-01
## demographics  1.302205e-05  0.0013818442 -1.293014e-02  3.517446e-02
## profession    1.241762e-05  0.0027050067 -3.091270e-03 -4.109357e-01
## income        9.999999e-01  0.0003986646 -1.440856e-06  1.121479e-05
## indebt_n      0.000000e+00  0.0000000000  0.000000e+00 -2.168404e-19
## isHomeOwner   1.026553e-06 -0.0001807698  6.250264e-03 -1.027919e-02
## Owns_car_n    1.026553e-06 -0.0001807698  6.250264e-03 -1.027919e-02
##                        PC5           PC6           PC7           PC8
## age           9.166670e-04 -1.497893e-03  2.470275e-04 -3.450456e-04
## gender       -3.517460e-01 -3.593017e-01  4.223560e-01 -7.101052e-02
## time_spent   -1.657930e-02  5.036187e-04 -3.551447e-03  1.410835e-02
## platform      3.750819e-01 -1.117133e-01 -4.031190e-01  5.925378e-01
## interests    -7.041350e-02  5.595109e-01  1.638633e-01  5.190607e-01
## location     -5.127703e-01 -2.485447e-01  2.475404e-01  5.862221e-01
## demographics -6.710025e-01  2.731499e-01 -6.442133e-01 -1.270491e-01
## profession    3.209752e-02  6.350209e-01  3.914263e-01 -1.015393e-01
## income        2.634320e-06 -1.343427e-05  9.318204e-06  9.720275e-06
## indebt_n      5.551115e-17  5.551115e-17  2.220446e-16  5.551115e-17
## isHomeOwner   8.985118e-02 -5.352827e-02 -3.704715e-02  4.541631e-02
## Owns_car_n    8.985118e-02 -5.352827e-02 -3.704715e-02  4.541631e-02
##                        PC9          PC10          PC11          PC12
## age          -1.376761e-03  1.710064e-04 -1.679048e-19  1.899616e-24
## gender        5.445745e-01  3.069017e-03 -4.361887e-17  7.194079e-18
## time_spent    9.546579e-03 -8.855509e-03  7.109285e-18 -1.424806e-19
## platform      5.522284e-01 -1.676626e-01 -1.211465e-17 -5.840796e-18
## interests    -2.658101e-01  5.959807e-02  7.816299e-17  1.643004e-17
## location     -1.356037e-01  2.716410e-02  2.071364e-17  7.687777e-18
## demographics  1.959784e-01  6.464175e-02  2.763692e-16 -2.875111e-17
## profession    5.127634e-01  1.954418e-02 -1.369295e-16  1.952284e-17
## income       -5.974391e-06 -2.167677e-06 -1.347762e-21  3.359005e-22
## indebt_n     -1.734723e-18 -1.355253e-20 -1.131848e-05 -1.000000e+00
## isHomeOwner   6.329362e-02  6.938844e-01  7.071068e-01 -8.003377e-06
## Owns_car_n    6.329362e-02  6.938844e-01 -7.071068e-01  8.003377e-06
# Summary of PCA to see explained variance
summary(social_media_pca)
## Importance of components:
##                         PC1      PC2   PC3   PC4    PC5    PC6    PC7    PC8
## Standard deviation     2959 13.44658 2.537 0.842 0.8212 0.8181 0.8096 0.7995
## Proportion of Variance    1  0.00002 0.000 0.000 0.0000 0.0000 0.0000 0.0000
## Cumulative Proportion     1  1.00000 1.000 1.000 1.0000 1.0000 1.0000 1.0000
##                           PC9   PC10     PC11      PC12
## Standard deviation     0.7856 0.7028 7.69e-16 3.114e-19
## Proportion of Variance 0.0000 0.0000 0.00e+00 0.000e+00
## Cumulative Proportion  1.0000 1.0000 1.00e+00 1.000e+00
(eigen_values <- social_media_pca$sdev^2)
##  [1] 8.753482e+06 1.808106e+02 6.436815e+00 7.090307e-01 6.743924e-01
##  [6] 6.693348e-01 6.553761e-01 6.392767e-01 6.172129e-01 4.938810e-01
## [11] 5.913805e-31 9.696976e-38
names(eigen_values) <- paste("PC", 1:length(eigen_values), sep="")

sum_lambdas <- sum(eigen_values)

prop_var <- eigen_values / sum_lambdas

cum_var <- cumsum(prop_var)

mat_lambdas <- rbind(eigen_values, prop_var, cum_var)
rownames(mat_lambdas) <- c("Eigenvalues", "Prop. variance", "Cum. prop. variance")

# Display rounded lambda matrix
round(mat_lambdas, 4)
##                         PC1      PC2    PC3   PC4    PC5    PC6    PC7    PC8
## Eigenvalues         8753482 180.8106 6.4368 0.709 0.6744 0.6693 0.6554 0.6393
## Prop. variance            1   0.0000 0.0000 0.000 0.0000 0.0000 0.0000 0.0000
## Cum. prop. variance       1   1.0000 1.0000 1.000 1.0000 1.0000 1.0000 1.0000
##                        PC9   PC10 PC11 PC12
## Eigenvalues         0.6172 0.4939    0    0
## Prop. variance      0.0000 0.0000    0    0
## Cum. prop. variance 1.0000 1.0000    1    1
# Display summary and rotation matrix
summary(social_media_pca)
## Importance of components:
##                         PC1      PC2   PC3   PC4    PC5    PC6    PC7    PC8
## Standard deviation     2959 13.44658 2.537 0.842 0.8212 0.8181 0.8096 0.7995
## Proportion of Variance    1  0.00002 0.000 0.000 0.0000 0.0000 0.0000 0.0000
## Cumulative Proportion     1  1.00000 1.000 1.000 1.0000 1.0000 1.0000 1.0000
##                           PC9   PC10     PC11      PC12
## Standard deviation     0.7856 0.7028 7.69e-16 3.114e-19
## Proportion of Variance 0.0000 0.0000 0.00e+00 0.000e+00
## Cumulative Proportion  1.0000 1.0000 1.00e+00 1.000e+00
social_media_pca$rotation
##                        PC1           PC2           PC3           PC4
## age          -3.987055e-04  0.9999712486  6.613825e-03  2.900275e-03
## gender       -9.272301e-06 -0.0010511450 -6.542692e-03  5.168512e-01
## time_spent    4.080700e-06 -0.0065652756  9.996442e-01 -3.457703e-03
## platform     -1.905031e-06  0.0006280188 -1.024693e-02  7.448229e-03
## interests    -6.221147e-06 -0.0009324409 -3.202427e-03  5.581186e-01
## location     -5.789312e-06  0.0016060967 -1.595995e-02 -5.009750e-01
## demographics  1.302205e-05  0.0013818442 -1.293014e-02  3.517446e-02
## profession    1.241762e-05  0.0027050067 -3.091270e-03 -4.109357e-01
## income        9.999999e-01  0.0003986646 -1.440856e-06  1.121479e-05
## indebt_n      0.000000e+00  0.0000000000  0.000000e+00 -2.168404e-19
## isHomeOwner   1.026553e-06 -0.0001807698  6.250264e-03 -1.027919e-02
## Owns_car_n    1.026553e-06 -0.0001807698  6.250264e-03 -1.027919e-02
##                        PC5           PC6           PC7           PC8
## age           9.166670e-04 -1.497893e-03  2.470275e-04 -3.450456e-04
## gender       -3.517460e-01 -3.593017e-01  4.223560e-01 -7.101052e-02
## time_spent   -1.657930e-02  5.036187e-04 -3.551447e-03  1.410835e-02
## platform      3.750819e-01 -1.117133e-01 -4.031190e-01  5.925378e-01
## interests    -7.041350e-02  5.595109e-01  1.638633e-01  5.190607e-01
## location     -5.127703e-01 -2.485447e-01  2.475404e-01  5.862221e-01
## demographics -6.710025e-01  2.731499e-01 -6.442133e-01 -1.270491e-01
## profession    3.209752e-02  6.350209e-01  3.914263e-01 -1.015393e-01
## income        2.634320e-06 -1.343427e-05  9.318204e-06  9.720275e-06
## indebt_n      5.551115e-17  5.551115e-17  2.220446e-16  5.551115e-17
## isHomeOwner   8.985118e-02 -5.352827e-02 -3.704715e-02  4.541631e-02
## Owns_car_n    8.985118e-02 -5.352827e-02 -3.704715e-02  4.541631e-02
##                        PC9          PC10          PC11          PC12
## age          -1.376761e-03  1.710064e-04 -1.679048e-19  1.899616e-24
## gender        5.445745e-01  3.069017e-03 -4.361887e-17  7.194079e-18
## time_spent    9.546579e-03 -8.855509e-03  7.109285e-18 -1.424806e-19
## platform      5.522284e-01 -1.676626e-01 -1.211465e-17 -5.840796e-18
## interests    -2.658101e-01  5.959807e-02  7.816299e-17  1.643004e-17
## location     -1.356037e-01  2.716410e-02  2.071364e-17  7.687777e-18
## demographics  1.959784e-01  6.464175e-02  2.763692e-16 -2.875111e-17
## profession    5.127634e-01  1.954418e-02 -1.369295e-16  1.952284e-17
## income       -5.974391e-06 -2.167677e-06 -1.347762e-21  3.359005e-22
## indebt_n     -1.734723e-18 -1.355253e-20 -1.131848e-05 -1.000000e+00
## isHomeOwner   6.329362e-02  6.938844e-01  7.071068e-01 -8.003377e-06
## Owns_car_n    6.329362e-02  6.938844e-01 -7.071068e-01  8.003377e-06
#Step 1: Run PCA on the Numeric Columns
#First, ensure you have converted categorical columns to numeric where appropriate and extracted the numeric columns for PCA. After running PCA, you obtain social_media_pca.

#Step 2: Combine PCA Scores with a Categorical Variable

# Assuming platform has been converted to numeric and you wish to analyze based on it
# Let's convert it back to a factor for meaningful grouping

social_media$platform <- factor(social_media$platform)

# Combining the platform data with PCA scores
platform_pca <- cbind(social_media$platform, social_media_pca$x)
platform_pca_df <- data.frame(platform_pca)

# Renaming the first column to Platform for clarity
colnames(platform_pca_df)[1] <- "Platform"

#Step 3: Calculate Means of Scores for All PCs Classified by Platform

tabMeansPC <- aggregate(platform_pca_df[,2:ncol(platform_pca_df)], by=list(Platform=platform_pca_df$Platform), mean)

# Order by Platform for clarity (optional, depending on your needs)
tabMeansPC <- tabMeansPC[order(tabMeansPC$Platform),]

# Transpose the means for better readability
tabMeansTransposed <- t(tabMeansPC[,-1])

# Rename columns to reflect the platforms
colnames(tabMeansTransposed) <- as.vector(tabMeansPC$Platform)

# Display the transposed table of means
tabMeansTransposed
##                  1             2             3
## PC1   2.354539e+02 -3.523684e+02  1.685617e+02
## PC2  -4.602901e-02 -2.346476e-01  3.009333e-01
## PC3   3.441383e-02  1.233100e-01 -1.676563e-01
## PC4   4.408186e-02 -8.909643e-02  5.699659e-02
## PC5  -4.390618e-01  4.651383e-02  3.572953e-01
## PC6   1.268585e-01 -8.794201e-03 -1.083432e-01
## PC7   4.305730e-01 -1.216235e-03 -3.992255e-01
## PC8  -6.072645e-01 -1.530680e-02  5.817775e-01
## PC9  -5.097721e-01 -7.576195e-02  5.575807e-01
## PC10  1.713704e-01 -6.198036e-02 -9.124804e-02
## PC11  2.279510e-17  3.660601e-17 -5.953686e-17
## PC12  6.168522e-18  2.039322e-19 -5.962915e-18
# Calculate standard deviations of scores for all the PCs classified by platform
tabsdsPC <- aggregate(platform_pca_df[,2:ncol(platform_pca_df)], by=list(Platform=platform_pca_df$Platform), sd)

# Transpose the standard deviations for better readability
tabfsds <- t(tabsdsPC[,-1])

# Rename columns to reflect the platforms
colnames(tabfsds) <- as.vector(tabsdsPC$Platform)

# Display the transposed table of standard deviations
tabfsds
##                 1            2            3
## PC1  2.929997e+03 2.971777e+03 2.942870e+03
## PC2  1.309473e+01 1.383220e+01 1.337415e+01
## PC3  2.494991e+00 2.616969e+00 2.484731e+00
## PC4  8.492569e-01 8.395054e-01 8.323739e-01
## PC5  7.156944e-01 7.555257e-01 7.955948e-01
## PC6  7.978562e-01 8.309905e-01 8.084922e-01
## PC7  7.409471e-01 7.296836e-01 7.488011e-01
## PC8  6.189056e-01 6.319315e-01 6.794303e-01
## PC9  6.599047e-01 6.378256e-01 6.788435e-01
## PC10 6.982697e-01 6.914581e-01 6.928906e-01
## PC11 6.039892e-16 6.129772e-16 6.374764e-16
## PC12 3.204325e-17 3.188947e-17 3.300782e-17
# ANOVA for PC1 scores across different platforms
anova_result_PC1 <- aov(PC1 ~ Platform, data=platform_pca_df)
summary(anova_result_PC1)
##              Df    Sum Sq Mean Sq F value Pr(>F)
## Platform      1 4.360e+05  436032    0.05  0.824
## Residuals   998 8.744e+09 8761816
## F ratio tests
pca_scores <- as.data.frame(social_media_pca$x)

# Add the grouping variable back to this data frame
pca_scores_with_group <- cbind(social_media$gender, pca_scores)
colnames(pca_scores_with_group)[1] <- "Gender"

# Subset data to include only groups 1 and 2
pca_scores_filtered <- pca_scores_with_group[pca_scores_with_group$Gender %in% c(1, 2), ]

# Then perform the variance test
var_test_result_PC1 <- var.test(PC1 ~ Gender, data = pca_scores_filtered)
print(var_test_result_PC1)
## 
##  F test to compare two variances
## 
## data:  PC1 by Gender
## F = 0.99231, num df = 330, denom df = 336, p-value = 0.9442
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.8002746 1.2306902
## sample estimates:
## ratio of variances 
##          0.9923114
#Inference 
#We have taken PC1 by gender, here using F test to compare two variances. The p value obtained is 0.9442 which is greater than 0.05, That means the null hypothesis should not be rejected.  
# Levene's tests (one-sided)
# Install the car package if you haven't already
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}

# Load the car package for Levene's test
library(car)
## Loading required package: carData
# Levene's test for equality of variances in PC1 scores across gender groups
(LTPC1 <- leveneTest(PC1 ~ gender, data = pca_scores_with_group))
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   2  0.0911  0.913
##       997
(p_PC1_1sided <- LTPC1$p.value / 2)  # If you're interested in a one-sided test
## numeric(0)
# Scatter plot for PC1 and PC2 scores, differentiating by gender
plot(social_media_pca$x[,1], social_media_pca$x[,2], 
     pch = ifelse(social_media$gender == 1, 1, 16), 
     xlab = "PC1", ylab = "PC2", 
     main = "PCA Scores for Social Media Data")
abline(h = 0)
abline(v = 0)
legend("bottomleft", legend = c("Gender 1", "Gender 2"), pch = c(1, 16))

# Scree plot
plot(social_media_pca$sdev^2, xlab = "Component number", ylab = "Eigenvalue", type = "b", main = "Scree Plot")

# Log of eigenvalues diagram
plot(log(social_media_pca$sdev^2), xlab = "Component number", ylab = "Log(Eigenvalue)", type = "b", main = "Log(Eigenvalue) Diagram")

print(summary(social_media_pca))
## Importance of components:
##                         PC1      PC2   PC3   PC4    PC5    PC6    PC7    PC8
## Standard deviation     2959 13.44658 2.537 0.842 0.8212 0.8181 0.8096 0.7995
## Proportion of Variance    1  0.00002 0.000 0.000 0.0000 0.0000 0.0000 0.0000
## Cumulative Proportion     1  1.00000 1.000 1.000 1.0000 1.0000 1.0000 1.0000
##                           PC9   PC10     PC11      PC12
## Standard deviation     0.7856 0.7028 7.69e-16 3.114e-19
## Proportion of Variance 0.0000 0.0000 0.00e+00 0.000e+00
## Cumulative Proportion  1.0000 1.0000 1.00e+00 1.000e+00
# Direct plot of PCA object (biplot or scree plot depending on the PCA object)
plot(social_media_pca)

#Inference

#We can see the graphstating the PCA scores for the social media data, where the gender 1 is male and gender 2 is female, it looks like the gender-2 is comparatively distributed more across all quadrants than the gender 1. 
# Better Ways to Visualize

library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(ggfortify)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit
library(corrplot)
## corrplot 0.92 loaded
library(devtools)
## Loading required package: usethis
# For demonstration, let's assume 'gender' as a binary variable for coloring (adjust as needed)
color_var <- ifelse(social_media$gender == "male", "blue", "red")

# Visualize original data (adjust columns as necessary)
pairs.panels(social_media[, c("time_spent", "income")], # select numeric columns as needed
             gap = 0,
             bg = color_var,
             pch=21)

# Visualize PCA results
pairs.panels(social_media_pca$x,
             gap=0,
             bg = color_var,
             pch=21)

# If the 'factoextra' package is used for 'fviz_*' functions, ensure it's installed and loaded
library(factoextra)

# Eigenvalues plot
fviz_eig(social_media_pca, addlabels = TRUE)

# Variable correlation with PCA dimensions
fviz_pca_var(social_media_pca, col.var = "cos2",
             gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
             repel = TRUE)

# Individual's projection
fviz_pca_ind(social_media_pca, col.ind = "cos2", 
             gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"), 
             repel = TRUE)

# Biplot
biplot(social_media_pca)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

# autoplot from 'ggfortify' for PCA results visualization
# Ensure 'ggfortify' is installed and loaded for autoplot
library(ggfortify)
autoplot(social_media_pca,
         data = social_media[, -1], # assuming the first column isn't part of PCA
         loadings = TRUE,
         labels = social_media$gender) # Adjust this based on your specific grouping variable

# Convert factors to numeric where applicable
social_media$gender <- as.numeric(factor(social_media$gender))
social_media$platform <- as.numeric(factor(social_media$platform))
social_media$interests <- as.numeric(factor(social_media$interests))
social_media$location <- as.numeric(factor(social_media$location))
social_media$demographics <- as.numeric(factor(social_media$demographics))
social_media$profession <- as.numeric(factor(social_media$profession))
social_media$isHomeOwner <- as.numeric(social_media$isHomeOwner) # Assuming TRUE/FALSE values
social_media$Owns_car_n <- as.numeric(social_media$Owns_car_n) # Assuming 0/1 values

# Exclude non-numeric columns if not converted
numeric_data <- social_media[, sapply(social_media, is.numeric)]

# Load FactoMineR for PCA
library(FactoMineR)

# Run PCA
res.pca <- PCA(numeric_data, graph = FALSE)

# Print the results
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1000 individuals, described by 12 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
eig.val <- get_eigenvalue(res.pca)
eig.val
##          eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.007901e+00     1.825365e+01                    18.25365
## Dim.2  1.119569e+00     1.017790e+01                    28.43155
## Dim.3  1.100359e+00     1.000326e+01                    38.43481
## Dim.4  1.047873e+00     9.526115e+00                    47.96092
## Dim.5  1.013310e+00     9.211905e+00                    57.17283
## Dim.6  1.008609e+00     9.169176e+00                    66.34200
## Dim.7  9.754899e-01     8.868090e+00                    75.21009
## Dim.8  9.464825e-01     8.604387e+00                    83.81448
## Dim.9  9.194751e-01     8.358865e+00                    92.17334
## Dim.10 8.609321e-01     7.826655e+00                   100.00000
## Dim.11 1.701736e-30     1.547033e-29                   100.00000
## Dim.12 1.392648e-35     1.266043e-34                   100.00000
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))

var <- get_pca_var(res.pca)

var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
# Coordinates
head(var$coord)
##                  Dim.1       Dim.2       Dim.3       Dim.4       Dim.5
## age        -0.01425355  0.65934084  0.06117281 -0.18254730  0.02973872
## gender     -0.01506543  0.03515050 -0.49426714  0.36282527 -0.38253904
## time_spent  0.05980970 -0.42086714 -0.15280845 -0.53394652 -0.10054958
## platform    0.08503092  0.16882790 -0.01681409  0.12828870  0.81864859
## interests  -0.03622925 -0.05650396 -0.34367438  0.33756612  0.18739139
## location   -0.02395865  0.42505377  0.26351332  0.03051799 -0.36238527
# Cos2: quality on the factore map
head(var$cos2)
##                   Dim.1       Dim.2        Dim.3        Dim.4        Dim.5
## age        0.0002031637 0.434730340 0.0037421122 0.0333235155 0.0008843917
## gender     0.0002269672 0.001235558 0.2443000104 0.1316421756 0.1463361155
## time_spent 0.0035771999 0.177129151 0.0233504210 0.2850988880 0.0101102188
## platform   0.0072302566 0.028502859 0.0002827138 0.0164579912 0.6701855108
## interests  0.0013125588 0.003192697 0.1181120762 0.1139508838 0.0351155334
## location   0.0005740169 0.180670705 0.0694392690 0.0009313479 0.1313230848
# Contributions to the principal components
head(var$contrib)
##                 Dim.1      Dim.2       Dim.3       Dim.4       Dim.5
## age        0.01011821 38.8301575  0.34008114  3.18011107  0.08727754
## gender     0.01130370  0.1103601 22.20185342 12.56280238 14.44140288
## time_spent 0.17815616 15.8211936  2.12207369 27.20739742  0.99774237
## platform   0.36009024  2.5458782  0.02569287  1.57060980 66.13827992
## interests  0.06536969  0.2851720 10.73396190 10.87449693  3.46543000
## location   0.02858791 16.1375255  6.31060338  0.08887987 12.95981904
# Correlation circle
fviz_pca_var(res.pca, col.var = "black")

# Quality of representation

if(any(is.na(var$cos2) | is.nan(var$cos2))) {
  print("There are NA or NaN values in var$cos2")
}
## [1] "There are NA or NaN values in var$cos2"
summary(var$cos2)
##      Dim.1               Dim.2               Dim.3          
##  Min.   :0.0001348   Min.   :0.0003713   Min.   :0.0000736  
##  1st Qu.:0.0002991   1st Qu.:0.0012165   1st Qu.:0.0020124  
##  Median :0.0013126   Median :0.0078166   Median :0.0694393  
##  Mean   :0.1825365   Mean   :0.1017790   Mean   :0.1000326  
##  3rd Qu.:0.0054037   3rd Qu.:0.1788999   3rd Qu.:0.1812060  
##  Max.   :0.9959484   Max.   :0.4347303   Max.   :0.2826657  
##  NA's   :1           NA's   :1           NA's   :1          
##      Dim.4               Dim.5         
##  Min.   :0.0008908   Min.   :0.000240  
##  1st Qu.:0.0086947   1st Qu.:0.001196  
##  Median :0.0351254   Median :0.010110  
##  Mean   :0.0952612   Mean   :0.092119  
##  3rd Qu.:0.1227965   3rd Qu.:0.083219  
##  Max.   :0.3625058   Max.   :0.670185  
##  NA's   :1           NA's   :1
fviz_cos2(res.pca, choice = "var", axes = 1:2)

fviz_pca_var(res.pca, col.var = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE # Avoid text overlapping
             )

# Change the transparency by cos2 values
fviz_pca_var(res.pca, alpha.var = "cos2")

corrplot(var$contrib, is.corr=FALSE)

# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

fviz_pca_var(res.pca, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
             )

fviz_pca_var(res.pca, alpha.var = "contrib")

# Assuming social_media$gender is already a factor
# If not, convert it to factor
social_media$gender <- factor(social_media$gender)
library(FactoMineR)
res.pca <- PCA(numeric_data, graph = FALSE)
library(factoextra)

# Check if the length matches
if(length(social_media$gender) == nrow(numeric_data)) {
  fviz_pca_ind(res.pca,
               geom.ind = "point", # show points only (not "text")
               col.ind = social_media$gender, # color by groups, ensure it matches PCA data
               palette = c("#00AFBB", "#E7B800", "#FC4E07"),
               addEllipses = TRUE, # Concentration ellipses
               legend.title = "Gender"
              )
} else {
  print("The length of the grouping variable does not match the PCA data.")
}

```